Version: 1.0 (Jupytext, time measurements, logger)
Please put your comments about the notebook functionality here.
import sys
import os
# Make the directories one and two levels above the working directory
# importable, so the `src` package resolves from this notebook.
sys.path.extend(
    os.path.join(os.getcwd(), relative_parent) for relative_parent in ("..", "../..")
)
ToC
Necessary libraries for notebook functionality:
NOTE: When the button is created through this function, it works only in the active notebook. If the functionality needs to be preserved in an HTML export, the code has to be included directly in the notebook.
from src.utils.notebook_support_functions import create_button, get_notebook_name
from src.utils.logger import Logger
from src.utils.envs import Envs
from src.utils.config import Config
from pandas import options
from IPython.display import display, HTML
Constants for overall behaviour.
# Names handed to Envs.set_logger / Envs.set_config below.
LOGGER_CONFIG_NAME = "logger_file_console"  # default
PYTHON_CONFIG_NAME = "python_local"  # default
# Optional notebook extras; both are switched off by default.
CREATE_BUTTON = False
ADDAPT_WIDTH = False
NOTEBOOK_NAME = get_notebook_name()

# Let pandas render up to 500 rows and 500 columns before truncating output.
options.display.max_rows = 500
options.display.max_columns = 500

# Initialise logging and configuration for this notebook run.
envs = Envs()
envs.set_logger(LOGGER_CONFIG_NAME)
envs.set_config(PYTHON_CONFIG_NAME)
# Start the notebook-wide timer; it is closed by Logger().end_timer() in the last cell.
Logger().start_timer(f"NOTEBOOK; Notebook name: {NOTEBOOK_NAME}")

if CREATE_BUTTON:
    create_button()
if ADDAPT_WIDTH:
    # Inject CSS so the notebook container uses the full browser width.
    display(HTML("<style>.container { width:100% !important; }</style>"))  # notebook width
A: ../../configurations\logger_file_console.conf 2023-03-21 16:18:42,821 - git.util - DEBUG - Failed checking if running in CYGWIN due to: FileNotFoundError(2, 'The system cannot find the file specified', None, 2, None) 2023-03-21 16:18:42,824 - file_console - DEBUG - Logger was created on WS-3000 in branche 005_cumulative_repository_update. 2023-03-21 16:18:42,824 - file_console - DEBUG - Process: NOTEBOOK; Notebook name: data_frame_explorer_documentation.py; Timer started;
from importlib import reload
from pandas import DataFrame
from numpy.random import choice, randn, seed
import src.data.df_explorer as DFE
# from src.global_constants import * # Remember to import only the constants in use
# Display constants for this notebook: rows to show in previews and the
# fixed figure layout (auto-sizing off, explicit width and height).
N_ROWS_TO_DISPLAY = 2
FIGURE_SIZE_SETTING = dict(autosize=False, width=2200, height=750)
# Load the Python configuration data (the log line below reports it is built from python_local.conf).
config = Config().get_data()
2023-03-21 16:18:42,962 - file_console - DEBUG - Python config was created from python_local.conf file.
# Build a small reproducible demo frame: one categorical column ("sex")
# and one standard-normal numeric column ("number"), 100 rows each.
n = 100
seed(876)  # fixed seed so the outputs shown in this notebook are repeatable
df = DataFrame(
    {
        # Dict entries are evaluated top to bottom, so "sex" is drawn before
        # "number" and the random stream is consumed in the same order.
        "sex": choice(["male", "female"], n),
        "number": randn(n),
    }
)
df.head()
| | sex | number |
|---|---|---|
| 0 | male | -0.087944 |
| 1 | female | -1.078677 |
| 2 | female | -1.269129 |
| 3 | female | -0.616576 |
| 4 | female | -0.390710 |
# Re-import the module so recent edits to src.data.df_explorer take effect in this session.
reload(DFE)
df_explorer = DFE.DFExplorer()
# Print an overview of the frame: type, shape, dtypes, head and descriptive statistics.
df_explorer.print_info_about_data_frame(df=df)
DataFrame type: <class 'pandas.core.frame.DataFrame'>
DataFrame shape: (100, 2)
DataFrame dtypes: {'sex': 'object', 'number': 'float64'}
DataFrame head:
sex number
0 male -0.087944
1 female -1.078677
2 female -1.269129
3 female -0.616576
4 female -0.390710
DataFrame description:
number
count 100.000000
mean -0.109604
std 1.137798
min -3.088058
25% -0.780472
50% -0.143400
75% 0.600610
max 3.264726
# Show the column-name -> dtype-name mapping of the frame (see the dict output below).
df_explorer.get_df_types(df=df)
{'sex': 'object', 'number': 'float64'}
# Report the memory usage of the "number" column for each candidate dtype in list_dtypes.
df_explorer.get_memory_usage(df=df, attr_name="number", list_dtypes=["float64", "float32", "float16"])
Memory usage for attribute: number Attribute Name: number Measured dtype: float64 Memory Usage: 928 Attribute Name: number Measured dtype: float32 Memory Usage: 528 Attribute Name: number Measured dtype: float16 Memory Usage: 328
# NaN summary per attribute; with fraction=True the report is a table with a "Fraction" column and a TOTAL row.
df_explorer.get_nan_stats(df=df, fraction=True)
DataFrame shape: (100, 2) Total number of NaN values: 0 NaN values per Attribute: Attribute NaN values Fraction ----------- ------------ ---------- sex 0 0 number 0 0 TOTAL 0 0
# Same NaN summary with fraction=False: only the raw per-attribute NaN counts are printed.
df_explorer.get_nan_stats(df=df, fraction=False)
DataFrame shape: (100, 2) Total number of NaN values: 0 NaN values per Attribute: sex 0 number 0 dtype: int64
# Print per-attribute statistics: dtype, null count, number and share of unique values.
# NOTE(review): value counts appear only for "sex" in the output below - presumably
# limited to low-cardinality attributes; confirm against DFExplorer.
df_explorer.print_attr_stats(df=df)
Attribute Name: sex Attribute type: object Number of Null values: 0 Number of unique values is:2 Percentage of unique values is: 0.02 Summation of unique values per ID: male 55 female 45 Name: sex, dtype: int64
############################################# Attribute Name: number Attribute type: float64 Number of Null values: 0 Number of unique values is:100 Percentage of unique values is: 1.0
#############################################
# Two 3x3 fixtures for the comparison demo. They are identical except for
# the middle row of the second column (2. in data_1, 10. in data_2).
attr_names = ["NUMBER_1", "NUMBER_2", "NUMBER_3"]
data_1 = [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [4.0, 5.0, 2.0]]
data_2 = [[1.0, 2.0, 3.0], [3.0, 10.0, 1.0], [4.0, 5.0, 2.0]]
# Wrap both fixtures in frames that share the same column names.
df_1, df_2 = (DataFrame(rows, columns=attr_names) for rows in (data_1, data_2))
# Identical data frames: df_1 is compared with itself, so every reported difference is 0.0.
df_explorer.compare_attributes_in_data_frames(df_1, df_1, attr_names)
Are DFs equal in pandas? True Checking Overall Sums for DFs - Sum of First List is: 23.0 - Sum of Second List is: 23.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_1 - Sum of First List is: 8.0 - Sum of Second List is: 8.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_2 - Sum of First List is: 9.0 - Sum of Second List is: 9.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_3 - Sum of First List is: 6.0 - Sum of Second List is: 6.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0
# Non-identical data frames: df_2 differs from df_1 only in NUMBER_2 (10. instead of 2. in the middle row).
df_explorer.compare_attributes_in_data_frames(df_1, df_2, attr_names)
Are DFs equal in pandas? False Checking Overall Sums for DFs - Sum of First List is: 23.0 - Sum of Second List is: 31.0 - Subtraction of Sums of Lists is: -8.0 - Percentage of Difference (1-2)/1 is: -0.34782608695652173 - Percentage of Difference (1-2)/2 is: -0.25806451612903225 Checking for Attribute: NUMBER_1 - Sum of First List is: 8.0 - Sum of Second List is: 8.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0 Checking for Attribute: NUMBER_2 - Sum of First List is: 9.0 - Sum of Second List is: 17.0 - Subtraction of Sums of Lists is: -8.0 - Percentage of Difference (1-2)/1 is: -0.8888888888888888 - Percentage of Difference (1-2)/2 is: -0.47058823529411764 Checking for Attribute: NUMBER_3 - Sum of First List is: 6.0 - Sum of Second List is: 6.0 - Subtraction of Sums of Lists is: 0.0 - Percentage of Difference (1-2)/1 is: 0.0 - Percentage of Difference (1-2)/2 is: 0.0
# Stop the timer started by Logger().start_timer(...) during setup; the duration is logged below.
Logger().end_timer()
2023-03-21 16:18:43,415 - file_console - DEBUG - Process: NOTEBOOK; Notebook name: data_frame_explorer_documentation.py; Timer ended; Process Duration [s]: 0.59; Process Duration [m]: 0.01